package org.apache.lucene.search;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.index.TermDocs; // for javadocs

/**
 * A {@link Filter} that only accepts documents whose single
 * term value in the specified field is contained in the
 * provided set of allowed terms.
 * 
 * <p/>
 * 
 * This is the same functionality as TermsFilter (from
 * contrib/queries), except this filter requires that the
 * field contains only a single term for all documents.
 * Because of drastically different implementations, they
 * also have different performance characteristics, as
 * described below.
 * 
 * <p/>
 * 
 * The first invocation of this filter on a given field will
 * be slower, since a {@link FieldCache.StringIndex} must be
 * created.  Subsequent invocations using the same field
 * will re-use this cache.  However, as with all
 * functionality based on {@link FieldCache}, persistent RAM
 * is consumed to hold the cache, and is not freed until the
 * {@link IndexReader} is closed.  In contrast, TermsFilter
 * has no persistent RAM consumption.
 * 
 * 
 * <p/>
 * 
 * With each search, this filter translates the specified
 * set of Terms into a private {@link FixedBitSet} keyed by
 * term number per unique {@link IndexReader} (normally one
 * reader per segment).  Then, during matching, the term
 * number for each docID is retrieved from the cache and
 * then checked for inclusion using the {@link FixedBitSet}.
 * Since all testing is done using RAM resident data
 * structures, performance should be very fast, most likely
 * fast enough to not require further caching of the
 * DocIdSet for each possible combination of terms.
 * However, because docIDs are simply scanned linearly, an
 * index with a great many small documents may find this
 * linear scan too costly.
 * 
 * <p/>
 * 
 * In contrast, TermsFilter builds up a {@link FixedBitSet},
 * keyed by docID, every time it's created, by enumerating
 * through all matching docs using {@link TermDocs} to seek
 * and scan through each term's docID list.  While there is
 * no linear scan of all docIDs, besides the allocation of
 * the underlying array in the {@link FixedBitSet}, this
 * approach requires a number of "disk seeks" in proportion
 * to the number of terms, which can be exceptionally costly
 * when there are cache misses in the OS's IO cache.
 * 
 * <p/>
 * 
 * Generally, this filter will be slower on the first
 * invocation for a given field, but subsequent invocations,
 * even if you change the allowed set of Terms, should be
 * faster than TermsFilter, especially as the number of
 * Terms being matched increases.  If you are matching only
 * a very small number of terms, and those terms in turn
 * match a very small number of documents, TermsFilter may
 * perform faster.
 *
 * <p/>
 *
 * Which filter is best is very application dependent.
 */

public class FieldCacheTermsFilter extends Filter {
    private final String field;
    private final String[] terms;

    /**
     * @param field the field whose single term value per document is tested
     * @param terms the set of allowed term values for that field
     */
    public FieldCacheTermsFilter(String field, String... terms) {
        this.field = field;
        this.terms = terms;
    }

    /** Returns the {@link FieldCache} used to obtain the per-field term index. */
    public FieldCache getFieldCache() {
        return FieldCache.DEFAULT;
    }

    @Override
    public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
        return new FieldCacheTermsFilterDocIdSet(getFieldCache().getStringIndex(reader, field));
    }

    protected class FieldCacheTermsFilterDocIdSet extends DocIdSet {
        private final FieldCache.StringIndex fcsi;

        /** One bit per term ordinal; a set bit marks an allowed term. */
        private final FixedBitSet bits;

        public FieldCacheTermsFilterDocIdSet(FieldCache.StringIndex fcsi) {
            this.fcsi = fcsi;
            bits = new FixedBitSet(this.fcsi.lookup.length);
            for (int i = 0; i < terms.length; i++) {
                int termNumber = this.fcsi.binarySearchLookup(terms[i]);
                // Ordinal 0 is reserved for "no term" (null entry in lookup),
                // and a negative result means the term does not occur in this
                // reader; both are skipped, so only real matches are set.
                if (termNumber > 0) {
                    bits.set(termNumber);
                }
            }
        }

        @Override
        public DocIdSetIterator iterator() {
            return new FieldCacheTermsFilterDocIdSetIterator();
        }

        /** This DocIdSet implementation is cacheable. */
        @Override
        public boolean isCacheable() {
            return true;
        }

        protected class FieldCacheTermsFilterDocIdSetIterator extends DocIdSetIterator {
            /** Current docID; -1 before first use, NO_MORE_DOCS when exhausted. */
            private int doc = -1;

            @Override
            public int docID() {
                return doc;
            }

            @Override
            public int nextDoc() {
                // Once doc is NO_MORE_DOCS, doc + 1 overflows to a negative
                // value; advance() treats any negative target as exhausted.
                return advance(doc + 1);
            }

            @Override
            public int advance(int target) {
                final int maxDoc = fcsi.order.length;
                int d = target;
                if (d < 0) {
                    // Only reachable via overflow after exhaustion; stay done.
                    doc = NO_MORE_DOCS;
                    return doc;
                }
                // Linear scan: accept the first doc whose term ordinal is in
                // the allowed-terms bit set.
                while (d < maxDoc && !bits.get(fcsi.order[d])) {
                    d++;
                }
                doc = d < maxDoc ? d : NO_MORE_DOCS;
                return doc;
            }
        }
    }
}
